{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "88d005d0",
   "metadata": {},
   "source": [
    "\n",
    "#### This notebook shows how to use the OpenSearch transform to create a new index from the embeddings found in a Parquet file\n",
    "\n",
    "##### Prior to using this notebook, make sure you have an instance of OpenSearch running\n",
    "\n",
    "##### To run a temporary instance of OpenSearch on a local machine:\n",
    "```\n",
    "    export OPENSEARCH_INITIAL_ADMIN_PASSWORD=R_${RANDOM}_s_$(date +%s)\n",
    "    echo \"OPENSEARCH_PASSWORD=$OPENSEARCH_INITIAL_ADMIN_PASSWORD\" >> .env\n",
    "    docker compose -f docker-compose-default.3.2.0.yml up -d\n",
    "\n",
    "```\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56c62c9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "## If you have cloned the repo and are running from the local transform/opensearch folder:\n",
    "%pip install -e ../../../data-processing-lib/\n",
    "## Otherwise, pip install from PyPI once this is released:\n",
    "#%pip install data-prep-toolkit-transforms[opensearch]\n",
    "%pip install python-dotenv\n",
    "%pip install -r requirements.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d7316d5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "# Use a password already present in the environment; when it is unset or\n",
    "# empty, load the local .env file and read the variable again.\n",
    "opensearch_pwd = os.getenv('OPENSEARCH_PASSWORD')\n",
    "if not opensearch_pwd:\n",
    "    load_dotenv()\n",
    "    opensearch_pwd = os.getenv('OPENSEARCH_PASSWORD')\n",
    "\n",
    "assert opensearch_pwd, \"Must specify an opensearch password as an environment variable or in local .env file\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f3384d8c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "{\"time\": \"11:32:29\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"OpenSearch parameters are : {'os_endpoint': 'localhost:9200', 'os_index': 'dpk_test_251110113229', 'os_document_id_column_name': 'document_id', 'os_embeddings_column_name': 'embeddings', 'os_dimension_size': None, 'os_content_column_name': 'contents', 'os_delete_index': False, 'os_disable_security': False, 'os_verify_certs': False, 'os_vector_method': None}\"}\n",
      "{\"time\": \"11:32:29\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"pipeline id pipeline_id\"}\n",
      "{\"time\": \"11:32:29\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"code location {'github': 'UNDEFINED', 'build-date': 'UNDEFINED', 'commit_hash': 'UNDEFINED', 'path': 'UNDEFINED'}\"}\n",
      "{\"time\": \"11:32:29\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"data factory data_ max_files -1, n_sample -1\"}\n",
      "{\"time\": \"11:32:29\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\"}\n",
      "{\"time\": \"11:32:29\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"data factory data_ Data Access:  DataAccessLocal\"}\n",
      "{\"time\": \"11:32:29\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"orchestrator os started at 2025-11-10 11:32:29\"}\n",
      "{\"time\": \"11:32:29\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"Number of files is 1, source profile {'max_file_size': 0.005139350891113281, 'min_file_size': 0.005139350891113281, 'total_file_size': 0.005139350891113281}\"}\n",
      "{\"time\": \"11:32:29\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"OpenSearch security is enabled\"}\n",
      "{\"time\": \"11:32:29\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"Transforming one table with 2 rows\"}\n",
      "{\"time\": \"11:32:29\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"Column embeddings exists, apply k-NN index\"}\n",
      "{\"time\": \"11:32:29\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"filename column is missing, add it with the value test1.parquet\"}\n",
      "{\"time\": \"11:32:29\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"index dpk_test_251110113229 created\"}\n",
      "{\"time\": \"11:32:30\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"Successfully indexed 2 documents into index dpk_test_251110113229 \"}\n",
      "{\"time\": \"11:32:30\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"Completed 1 files (100.0%) in 0.018 min\"}\n",
      "{\"time\": \"11:32:30\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"Done processing 1 files, waiting for flush() completion.\"}\n",
      "{\"time\": \"11:32:30\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"done flushing in 0.0 sec\"}\n",
      "{\"time\": \"11:32:30\", \"logger\": \"dpk\", \"logLevel\": \"INFO\", \"message\": \"Completed execution in 0.018 min, execution result 0\"}\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datetime import datetime\n",
    "\n",
    "from dpk_opensearch import OpenSearch\n",
    "\n",
    "# The index name carries a yymmddHHMMSS timestamp taken when this cell runs.\n",
    "test_index = \"dpk_test_\" + datetime.now().strftime('%y%m%d%H%M%S')\n",
    "\n",
    "# NOTE(review): no password is passed here; presumably the transform reads\n",
    "# OPENSEARCH_PASSWORD from the environment -- confirm.\n",
    "# transform() is the cell's last expression, so its result is displayed.\n",
    "OpenSearch(\n",
    "    input_folder=\"test-data/input\",\n",
    "    output_folder=\"tmp\",\n",
    "    os_endpoint='localhost:9200',\n",
    "    os_index=test_index,\n",
    ").transform()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d30f99c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
